examine the data

First examine the data by loading it in with the %Load command It will fill in the block with the content of the file in the block and comment out the %Load command


In [ ]:
%load data/numbers.txt

examine the sorting code

Do not run the block a second time


In [ ]:
# %load code/MRSortByString.py
from mrjob.job import MRJob

class MRSortByString(MRJob):
    def mapper(self, _, line):
        """
        """
        l = line.split(' ')
        print l
        yield l[1], l[0]

    def reducer(self, key, val):
        yield key, [v for v in val][0]


if __name__ == '__main__':
    MRSortByString.run()

In [5]:
%run   code/MRSortByString.py data/numbers.txt


no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
creating tmp directory c:\cygwin64\tmp\MRSortByString.PS.20171002.015754.754000

PLEASE NOTE: Starting in mrjob v0.5.0, protocols will be strict by default. It's recommended you run your job with --strict-protocols or set up mrjob.conf as described at https://pythonhosted.org/mrjob/whats-new.html#ready-for-strict-protocols

writing to c:\cygwin64\tmp\MRSortByString.PS.20171002.015754.754000\step-0-mapper_part-00000
Counters from step 1:
  (no counters found)
writing to c:\cygwin64\tmp\MRSortByString.PS.20171002.015754.754000\step-0-mapper-sorted
> sort 'c:\cygwin64\tmp\MRSortByString.PS.20171002.015754.754000\step-0-mapper_part-00000'
writing to c:\cygwin64\tmp\MRSortByString.PS.20171002.015754.754000\step-0-reducer_part-00000
Counters from step 1:
  (no counters found)
Moving c:\cygwin64\tmp\MRSortByString.PS.20171002.015754.754000\step-0-reducer_part-00000 -> c:\cygwin64\tmp\MRSortByString.PS.20171002.015754.754000\output\part-00000
Streaming final output from c:\cygwin64\tmp\MRSortByString.PS.20171002.015754.754000\output
['1', '10']
['2', '11']
['3', '3']
['4', '12']
['5', '4']
['6', '1']
['7', '1']
['8', '41']
['9', '532']
['10', '2']
['11', '0']
"0"	"11"
"1"	"6"
"10"	"1"
"11"	"2"
"12"	"4"
"2"	"10"
"3"	"3"
"4"	"5"
"41"	"8"
"532"	"9"
removing tmp directory c:\cygwin64\tmp\MRSortByString.PS.20171002.015754.754000

How were they sorted?


In [ ]:
# %load code/MRSortByInt.py
from mrjob.job import MRJob

class MRSortByInt(MRJob):
    def mapper(self, _, line):
        """
        """
        l = line.strip('\n').split()
        yield '%01d'%int(l[1]), l[0]

    def reducer(self, key, val):
        yield int(key), int(list(val)[0])


if __name__ == '__main__':
    MRSortByInt.run()

In [11]:
%run code/MRSortByInt.py data/numbers.txt


no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
creating tmp directory c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000
creating tmp directory c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000


PLEASE NOTE: Starting in mrjob v0.5.0, protocols will be strict by default. It's recommended you run your job with --strict-protocols or set up mrjob.conf as described at https://pythonhosted.org/mrjob/whats-new.html#ready-for-strict-protocols
PLEASE NOTE: Starting in mrjob v0.5.0, protocols will be strict by default. It's recommended you run your job with --strict-protocols or set up mrjob.conf as described at https://pythonhosted.org/mrjob/whats-new.html#ready-for-strict-protocols


writing to c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\step-0-mapper_part-00000
writing to c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\step-0-mapper_part-00000
Counters from step 1:
Counters from step 1:
  (no counters found)
  (no counters found)
writing to c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\step-0-mapper-sorted
writing to c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\step-0-mapper-sorted
> sort 'c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\step-0-mapper_part-00000'
> sort 'c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\step-0-mapper_part-00000'
writing to c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\step-0-reducer_part-00000
writing to c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\step-0-reducer_part-00000
Counters from step 1:
Counters from step 1:
  (no counters found)
  (no counters found)
Moving c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\step-0-reducer_part-00000 -> c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\output\part-00000
Moving c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\step-0-reducer_part-00000 -> c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\output\part-00000
Streaming final output from c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\output
Streaming final output from c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000\output
0	11
1	6
10	1
11	2
12	4
2	10
3	3
4	5
41	8
532	9
removing tmp directory c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000
removing tmp directory c:\cygwin64\tmp\MRSortByInt.PS.20171002.020226.416000

In [36]:
%%writefile data/sortdata.txt
1 1
2 4
3 8
4 2
4 7
5 5
6 10
7 11


Writing data/sortdata.txt

In [39]:
# Running code inline example

In [38]:
# -*- coding: utf-8 -*-
# Testing word frequency count
import os, sys
sys.path.append(os.path.join(os.getcwd(),"code"))
from MRSortByString import *
from mrjob.job import MRJob
'''
This is a simple wrapper that runs mrjob MapReduce jobs, the inputs are:
MRJobClass - the class of the job to be run
argsArr - an array of strings to be used when creating the MRJob.
@author: Peter Harrington  if you have any questions: peter.b.harrington@gmail.com
'''
def runJob(MRJobClass, argsArr, loc='local'):
    if loc == 'emr': 
        argsArr.extend(['-r', 'emr'])
    print "starting %s job on %s" % (MRJobClass.__name__, loc)
    mrJob = MRJobClass(args=argsArr)
    runner = mrJob.make_runner()
    runner.run()
    print "finished %s job" % MRJobClass.__name__
    return mrJob, runner
    
def runParallelJob(MRJobClass, argsArr):            #TO DO: add threading to allow jobs to run in 
    pass                                                #parallel 
    #launch a new thread
    #call runJob(MRJobClass, argsArr) on the new thread

if __name__ == '__main__':
# pass in file from outside
#    MRWordFrequencyCount.run()
#setup file here
    mr_job, runner = runJob(MRSortByString,[os.path.join(os.path.join(os.getcwd(),"data"),"sortdata.txt")],"local")
    print "Sorting sortdata.txt"
    for line in runner.stream_output(): 
        key, value = mr_job.parse_output_line(line)
        print "%s: %s "%(key,value)


no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
creating tmp directory c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000
creating tmp directory c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000


PLEASE NOTE: Starting in mrjob v0.5.0, protocols will be strict by default. It's recommended you run your job with --strict-protocols or set up mrjob.conf as described at https://pythonhosted.org/mrjob/whats-new.html#ready-for-strict-protocols
PLEASE NOTE: Starting in mrjob v0.5.0, protocols will be strict by default. It's recommended you run your job with --strict-protocols or set up mrjob.conf as described at https://pythonhosted.org/mrjob/whats-new.html#ready-for-strict-protocols


writing to c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\step-0-mapper_part-00000
writing to c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\step-0-mapper_part-00000
Counters from step 1:
Counters from step 1:
  (no counters found)
  (no counters found)
writing to c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\step-0-mapper-sorted
writing to c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\step-0-mapper-sorted
> sort 'c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\step-0-mapper_part-00000'
> sort 'c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\step-0-mapper_part-00000'
writing to c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\step-0-reducer_part-00000
writing to c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\step-0-reducer_part-00000
Counters from step 1:
Counters from step 1:
  (no counters found)
  (no counters found)
Moving c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\step-0-reducer_part-00000 -> c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\output\part-00000
Moving c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\step-0-reducer_part-00000 -> c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\output\part-00000
Streaming final output from c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\output
Streaming final output from c:\cygwin64\tmp\MRSortByString.PS.20171002.023747.748000\output
starting MRSortByString job on local
['1', '1']
['2', '4']
['3', '8']
['4', '2']
['4', '7']
['5', '5']
['6', '10']
['7', '11']
finished MRSortByString job
Sorting sortdata.txt
1: 1 
10: 6 
11: 7 
2: 4 
4: 2 
5: 5 
7: 4 
8: 3 

Note the second column is reported by their string values